In [1]:
import pylearn2.utils
import pylearn2.config
import theano
import neukrill_net.dense_dataset
import neukrill_net.utils
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import holoviews as hl
%load_ext holoviews.ipython
import sklearn.metrics
In [2]:
cd ..
In [61]:
import pylearn2.space
In [62]:
final_shape = (48,48)
In [63]:
input_space = pylearn2.space.Conv2DSpace(shape=final_shape,num_channels=1,axes=['b',0,1,'c'])
We're going to make a model that just has a single fully-connected layer between input and output, so in Pylearn2 that's an MLP with a single rectified linear stage before the output layer. The weights are initialised by sampling a normal distribution with a standard deviation of 0.5. We assume that the input images will be standardised to have mean zero and variance 1.0.
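As a rough illustration, that standardisation amounts to subtracting the dataset-wide mean and dividing by its standard deviation. The sketch below shows the assumed behaviour of the global normalise option used in the dataset configs later in this notebook, not the library's actual code; the mu and sigma values are the ones those configs use.
import numpy as np

def global_standardise(images, mu=0.957, sigma=0.142):
    # mu and sigma are the dataset-wide statistics used in the configs below;
    # real images come out roughly zero-mean, unit-variance after this
    return (np.asarray(images, dtype=np.float32) - mu) / sigma

# e.g. global_standardise(X) for a batch X drawn from the dataset iterator below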
In [64]:
import pylearn2.models.mlp
In [65]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h5',
istdev=0.5),
pylearn2.models.mlp.Softmax(
n_classes=121,
layer_name='y',
istdev=0.5
)])
In [53]:
import neukrill_net.image_directory_dataset
In [13]:
import neukrill_net.augment
import os
In [16]:
dataset = neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
rotate=[0,90,180,270],
rotate_is_resizable=0,
flip=1,
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True
)
Testing this dataset:
In [17]:
iterator = dataset.iterator(mode='even_shuffled_sequential',batch_size=128)
In [18]:
X,y = iterator.next()
In [20]:
channels = None
for i in range(8):
    if not channels:
        channels = hl.Image(X[i,:].squeeze())
    else:
        channels += hl.Image(X[i,:].squeeze())
channels
Out[20]:
The rest of the train object stays the same, apart from the save path, and the algorithm will have to load one of these new ParallelDataset objects for its validation set. So we're still missing the pieces imported below.
It's worth noting that when we define the cost and the weight decay we have to address the new convolutional layers inside the composite layer.
In this case we're leaving out the dropout cost (using the default cost instead), as we want to see the model overfit before we choose to apply dropout. We're also limiting this model to 5 epochs to keep training times short.
In [54]:
import pylearn2.training_algorithms.sgd
import pylearn2.costs.mlp.dropout
import pylearn2.costs.cost
import pylearn2.termination_criteria
In [67]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
rotate=[0,90,180,270],
rotate_is_resizable=0,
flip=1,
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.EpochCounter(max_epochs=5)
)
In [68]:
import pylearn2.train_extensions
import pylearn2.train_extensions.best_params
At this point, unlike before, we're not initialising any extensions. Once it seems like they're required, we'll put them back in.
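For reference, the kind of extension we'd put back in is something like MonitorBasedSaveBest, which reappears in the final YAML at the end of this notebook. A minimal sketch (the save path here is hypothetical, not one used elsewhere in this notebook):
import pylearn2.train_extensions.best_params

# keep a copy of whichever model scores best on validation NLL so far
save_best = pylearn2.train_extensions.best_params.MonitorBasedSaveBest(
    channel_name='valid_y_nll',
    save_path='/disk/scratch/neuroglycerin/models/iterative_design_best.pkl')
# it would then be passed to pylearn2.train.Train(..., extensions=[save_best])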
In [69]:
import pylearn2.train
In [70]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/iterative_design_recent.pkl',
save_freq=1
)
We can live with that warning.
Now, attempting to run the model:
In [71]:
%env THEANO_FLAGS='device=gpu2,floatX=float32,base_compiledir=~/.theano/stonesoup2'
In [72]:
%%time
%%capture logs
train.main_loop()
In [76]:
with open("/disk/scratch/neuroglycerin/logs/iterative_design_1.log","w") as f:
    f.write(logs.stdout)
In [77]:
!cat /disk/scratch/neuroglycerin/logs/iterative_design_1.log | grep nll
So the NLL we're getting while training is massive, probably due to the very large weights in the only RectifiedLinear layer. Opening the pickle to graph this:
In [82]:
model = pylearn2.utils.serial.load(
"/disk/scratch/neuroglycerin/models/iterative_design_recent.pkl")
In [83]:
import neukrill_net.plotting as pl
In [84]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "mean_u" in c], x_axis="epoch")
Out[84]:
It's fairly clear the initial weights are much too high, probably because every pixel in the 48 by 48 image (2304 elements) is connected to every element in the 1024-unit hidden layer, and all of these weights are drawn with a standard deviation of 0.5. Decreasing this and running the model again.
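A rough back-of-envelope for why this blows up: with standardised inputs, each hidden pre-activation is a sum over 2304 weighted pixels, so its standard deviation is roughly sqrt(2304) * istdev, i.e. about 24 for istdev=0.5. This is just a sketch of that estimate, not anything the training code computes:
import numpy as np

fan_in = 48 * 48   # every pixel feeds every hidden unit
for istdev in [0.5, 0.05, 0.005]:
    # pre-activation std for unit-variance inputs and N(0, istdev^2) weights
    print("istdev {0}: pre-activation std ~ {1:.2f}".format(istdev, np.sqrt(fan_in) * istdev))

# simulate one layer to confirm the estimate for istdev=0.5
W = np.random.randn(fan_in, 1024) * 0.5
x = np.random.randn(128, fan_in)   # standardised input batch
print("simulated std: {0:.2f}".format(np.dot(x, W).std()))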
In [93]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h5',
istdev=0.05),
pylearn2.models.mlp.Softmax(
n_classes=121,
layer_name='y',
istdev=0.05
)])
In [94]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
rotate=[0,90,180,270],
rotate_is_resizable=0,
flip=1,
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.EpochCounter(max_epochs=5)
)
In [95]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/iterative_design_recent.pkl',
save_freq=1
)
In [107]:
%env THEANO_FLAGS='device=gpu2,floatX=float32,base_compiledir=~/.theano/stonesoup2'
In [97]:
%%time
%%capture logs
train.main_loop()
In [98]:
with open("/disk/scratch/neuroglycerin/logs/iterative_design_2.log","w") as f:
    f.write(logs.stdout)
In [100]:
!cat /disk/scratch/neuroglycerin/logs/iterative_design_2.log | grep nll
In [116]:
!cat /disk/scratch/neuroglycerin/logs/iterative_design_2.log | grep mean_x_max_u
Looks like it's still a little high, reducing it further:
In [102]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h5',
istdev=0.005),
pylearn2.models.mlp.Softmax(
n_classes=121,
layer_name='y',
istdev=0.005
)])
In [103]:
dataset = neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
rotate=[0,90,180,270],
rotate_is_resizable=0,
flip=1,
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode="test"
)
Running it again to compare against later, but with more epochs to see how this model proceeds:
In [105]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
rotate=[0,90,180,270],
rotate_is_resizable=0,
flip=1,
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.EpochCounter(max_epochs=50)
)
In [106]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/iterative_design_recent.pkl',
save_freq=1
)
In [108]:
%%time
%%capture logs
train.main_loop()
With the increased speed, we can see whether we overfit faster:
In [117]:
model = pylearn2.utils.serial.load(
"/disk/scratch/neuroglycerin/models/iterative_design_recent.pkl")
In [119]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[119]:
It looks like it's overfitting after about the 20th epoch, with the valid_y_nll immediately starting to diverge. We should be able to deal with this by increasing the augmentation we're using, avoiding the cost of adding dropout (which increases training time and reduces flexibility).
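To put a number on that, the divergence point can be read straight off the monitor. A quick sketch: valid_y_nll is the channel name used elsewhere in this notebook, and train_y_nll is assumed to follow the same naming convention for the train monitoring dataset.
valid_nll = np.array(model.monitor.channels['valid_y_nll'].val_record, dtype=float)
train_nll = np.array(model.monitor.channels['train_y_nll'].val_record, dtype=float)
best = int(valid_nll.argmin())
print("valid_y_nll bottoms out at epoch {0}: valid {1:.3f}, train {2:.3f}".format(
    best, valid_nll[best], train_nll[best]))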
Adding arbitrary rotations, shunts and scaling.
In [120]:
dataset = neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
rotate=-1,
rotate_is_resizable=0,
shunt=0.05,
scale=0.05,
flip=1,
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode="test"
)
In [124]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h5',
istdev=0.005),
pylearn2.models.mlp.Softmax(
n_classes=121,
layer_name='y',
istdev=0.005
)])
Unfortunately, I forgot to set up validation properly; it's not doing enough augmentation.
In [125]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
rotate=[0,90,180,270],
rotate_is_resizable=0,
flip=1,
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.EpochCounter(max_epochs=50)
)
In [126]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/iterative_design_recent.pkl',
save_freq=1
)
In [127]:
%%time
%%capture logs
train.main_loop()
In [139]:
model = pylearn2.utils.serial.load(
"/disk/scratch/neuroglycerin/models/iterative_design_recent.pkl")
In [130]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[130]:
Not overfitting yet, would like to load it and keep going.
In [131]:
import pylearn2.monitor
In [140]:
main_mlp = pylearn2.monitor.push_monitor(
model=model,
name="monitor_validation")
In [141]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
rotate=[0,90,180,270],
rotate_is_resizable=0,
flip=1,
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.EpochCounter(max_epochs=50)
)
In [142]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/continue_iterative_design_recent.pkl',
save_freq=1
)
In [143]:
%%time
%%capture logs
train.main_loop()
In [144]:
model = pylearn2.utils.serial.load(
"/disk/scratch/neuroglycerin/models/continue_iterative_design_recent.pkl")
In [145]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[145]:
So the training score continues to improve, but the validation score doesn't, although it isn't diverging like it was before. The updates also look very noisy, so it's probably worth reducing the learning rate.
So we've been able to deal with overfitting by adding more augmentation, so the model can't possibly learn the training set. We could add more layers and train for a very long time so that it might learn everything we could possibly do to the images, but an alternative is to encode some of our priors about what images are like into the architecture of the model, so that it can cover every possible image faster. That's basically the reason we should add a convolutional layer to our model.
Arbitrarily picking 128 output channels, on the intuition that if every class were just a simple shape the size of our kernel then this model would have no trouble finding every possible class every time; a power of two should also be a nice number. Some of the other parameters:
- init_bias: it's recommended to initialise the biases to zero.
- kernel_shape: apparently a good value is 5 by 5.
- irange: an arbitrarily small number, close to the istdev of the other layers set above.
- pool_shape and pool_stride: aiming to set these so that the input space to the MLP layers is reduced to a reasonable size (see the sketch after this list).
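To pick those pooling values, it helps to know roughly how the spatial size shrinks through a layer. A quick estimate, assuming a valid convolution followed by non-overlapping pooling; this is the usual formula, not taken from Pylearn2's internals, so the exact boundary handling may differ slightly:
def conv_pool_output(size, kernel, pool, stride):
    """Spatial size after a valid convolution followed by pooling."""
    detector = size - kernel + 1          # valid convolution
    return (detector - pool) // stride + 1

# 48x48 input, 5x5 kernel, 8x8 pooling with stride 8
out = conv_pool_output(48, 5, 8, 8)
print("output is {0}x{0}x128 = {1} units into the MLP layer".format(out, out * out * 128))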
In [170]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.mlp.ConvRectifiedLinear(
layer_name="h1",
output_channels=128,
init_bias=0.0,
kernel_shape= [5,5],
irange=0.001,
pool_shape=[8,8],
pool_stride=[8,8]
),
pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h2',
istdev=0.005),
pylearn2.models.mlp.Softmax(
n_classes=121,
layer_name='y',
istdev=0.005
)])
Changed the above pool_stride and pool_shape until I got the following result, without so many parameters:
In [162]:
print(main_mlp)
Running this model over the same dataset again.
In [169]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.EpochCounter(max_epochs=50)
)
In [171]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/continue_iterative_design_recent.pkl',
save_freq=1
)
Started at 00:09:
In [173]:
%%time
%%capture logs
train.main_loop()
In [174]:
model = pylearn2.utils.serial.load(
"/disk/scratch/neuroglycerin/models/continue_iterative_design_recent.pkl")
In [175]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[175]:
Seems to be going well, but I suspect it won't be able to overfit yet. Adding more convolutional layers to give it more of a chance, but to do that I had to change the pooling and kernel sizes carefully; the wrong settings quickly caused errors that were difficult to interpret.
In [202]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.mlp.ConvRectifiedLinear(
layer_name="h1",
output_channels=128,
init_bias=0.0,
kernel_shape= [4,4],
irange=0.001,
pool_shape=[6,6],
pool_stride=[4,4]
),
pylearn2.models.mlp.ConvRectifiedLinear(
layer_name="h2",
output_channels=128,
init_bias=0.0,
kernel_shape= [3,3],
irange=0.001,
pool_shape=[2,2],
pool_stride=[2,2]
),
pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h3',
istdev=0.005),
pylearn2.models.mlp.Softmax(
n_classes=121,
layer_name='y',
istdev=0.005
)])
In [203]:
print main_mlp
In [209]:
for l in main_mlp.get_params():
    print(l.name,l.get_value().size)
In [206]:
l.get_value().size
Out[206]:
So the number of parameters between the MLP layer and the previous layer (which used to be the input) has not changed much, only increased slightly. We can run this model for longer to make sure we see it overfit.
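For a quick cross-check, the per-parameter sizes above can also be totalled directly, using the same get_params() API as the loop above:
# total parameter count across all layers
total = sum(p.get_value().size for p in main_mlp.get_params())
print("total parameters: {0}".format(total))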
In [210]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.EpochCounter(max_epochs=300)
)
In [211]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/iterative_design_conv_recent.pkl',
save_freq=1
)
In [212]:
%%time
%%capture logs
train.main_loop()
In [214]:
with open("/disk/scratch/neuroglycerin/logs/iterative_design_conv.log","w") as f:
    f.write(logs.stdout)
In [215]:
model = pylearn2.utils.serial.load(
"/disk/scratch/neuroglycerin/models/iterative_design_conv_recent.pkl")
In [216]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[216]:
Very clear signs of overfitting, with the validation set diverging after about 100 epochs. Now all we need to do is check for divergence of the kernel and column norms:
In [219]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "norms_mean" in c], x_axis="epoch")
Out[219]:
Looks like the kernel norm means aren't saturating. The heuristic is to set max_kernel_norm to 80% of the value the mean saturates at (although I think this should probably just be 80% of the max); unfortunately, we've set the initialisation so low that it's going to take a long time to reach that value.
Looking at the max values, to see if they saturate at any point:
In [220]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "norms_max" in c], x_axis="epoch")
Out[220]:
They don't, but looking at these graphs the following model seems reasonable. I'm also increasing the initialisation weights, as they appear to be too low and are taking a long time to reach the saturation value.
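If the norms did saturate, the 80% heuristic could be read off the monitor directly. A sketch using the same channel-filtering pattern as the plots above; since the norms haven't saturated here, the last recorded value is only a lower bound on the true saturation value:
for name in [c for c in model.monitor.channels if "norms_max" in c]:
    # last recorded value; only a lower bound on the saturation value here
    saturation = float(model.monitor.channels[name].val_record[-1])
    print("{0}: 80% of last value = {1:.3f}".format(name, 0.8 * saturation))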
In [226]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.mlp.ConvRectifiedLinear(
layer_name="h1",
output_channels=128,
init_bias=0.0,
kernel_shape= [4,4],
irange=0.005,
max_kernel_norm=2.0,
pool_shape=[6,6],
pool_stride=[4,4]
),
pylearn2.models.mlp.ConvRectifiedLinear(
layer_name="h2",
output_channels=128,
init_bias=0.0,
max_kernel_norm=3.5,
kernel_shape= [3,3],
irange=0.005,
pool_shape=[2,2],
pool_stride=[2,2]
),
pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h3',
istdev=0.005,
max_col_norm=3.5),
pylearn2.models.mlp.Softmax(
n_classes=121,
max_col_norm=4.0,
layer_name='y',
istdev=0.005
)])
In [227]:
dataset = neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
rotate=-1,
rotate_is_resizable=0,
shunt=0.05,
scale=0.05,
shear=5,
flip=1,
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode="test"
)
In [228]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.EpochCounter(max_epochs=150)
)
In [229]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/iterative_design_conv_recent.pkl',
save_freq=1
)
Started at 13:11.
In [230]:
%%time
%%capture logs
train.main_loop()
In [232]:
model = pylearn2.utils.serial.load(
"/disk/scratch/neuroglycerin/models/iterative_design_conv_recent.pkl")
In [233]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[233]:
In [236]:
min(model.monitor.channels['valid_y_nll'].val_record)
Out[236]:
The serious overfitting problem appears to be fixed, and we're not yet seeing any problem with norm saturation. The final score on the validation set is also pretty respectable for a model trained on 10% of the data: 1.73.
In [237]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "norms_max" in c], x_axis="epoch")
Out[237]:
If we're not overfitting yet, it's because our model still isn't flexible enough, so we're going to increase the number of layers: another convolutional layer and another MLP layer. The aim is to include enough hidden units/layers for the flexibility we need, but no more, because extra capacity hurts training time.
In [244]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h1',
output_channels=128,
init_bias=0.0,
kernel_shape= [4,4],
irange=0.005,
max_kernel_norm=2.0,
pool_shape=[6,6],
pool_stride=[4,4]
),
pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h2',
output_channels=128,
init_bias=0.0,
max_kernel_norm=3.5,
kernel_shape= [3,3],
irange=0.005,
pool_shape=[2,2],
pool_stride=[2,2]
),
pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h3',
output_channels=128,
init_bias=0.0,
max_kernel_norm=3.5,
kernel_shape= [3,3],
irange=0.005,
pool_shape=[2,2],
pool_stride=[2,2]
),
pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h4',
istdev=0.005,
max_col_norm=3.5),
pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h5',
istdev=0.005,
max_col_norm=3.5),
pylearn2.models.mlp.Softmax(
n_classes=121,
max_col_norm=4.0,
layer_name='y',
istdev=0.005
)])
Dimensions still look good.
In [240]:
print main_mlp
In [241]:
for l in main_mlp.get_params():
    print(l.name,l.get_value().size)
Increasing training time to 200 epochs, as we anticipate it could take longer to converge with the extra parameters.
In [245]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.EpochCounter(max_epochs=200)
)
In [246]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/iterative_design_conv_recent.pkl',
save_freq=1
)
Started at 14:54.
In [247]:
%%time
%%capture logs
train.main_loop()
In [250]:
model = pylearn2.utils.serial.load(
"/disk/scratch/neuroglycerin/models/iterative_design_conv_recent.pkl")
In [252]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[252]:
In [251]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "norms_max" in c], x_axis="epoch")
Out[251]:
Looks like it's having trouble getting started. The weights grow slowly in the output layers, and until they do I suppose it's difficult for the gradient updates to reach the earlier layers; once that happens, it does start to improve faster. This suggests we should increase the initialisation of the weights.
This may be a stupid idea, but if I can set the norms of these weight matrices to initialise in the "active" region seen after about 120 epochs in most of these graphs, it might work better. Given an initialisation range we can estimate what value the norms are likely to have, so I can just run a loop to figure out what a good initialisation might be:
In [259]:
# for the first layer 4,4,128
maxnorm = 0
irange = 0.005
while maxnorm < 0.2:
    W = ((np.random.rand(100,4,4,128)*2)-1)*irange
    maxnorm = np.max(np.sqrt(np.sum(W**2,axis=(1,2,3))))
    irange = irange*1.001
print("irange should be approximately {0}".format(irange))
In [260]:
# second layer and third layer
maxnorm = 0
irange = 0.005
while maxnorm < 0.2:
    W = ((np.random.rand(100,3,3,128)*2)-1)*irange
    maxnorm = np.max(np.sqrt(np.sum(W**2,axis=(1,2,3))))
    irange = irange*1.001
print("irange should be approximately {0}".format(irange))
In [265]:
# MLP layer 1
maxnorm = 0
istdev = 0.001
while maxnorm < 0.3:
    W = np.random.randn(1024,512)*istdev
    maxnorm = np.mean(np.sqrt(np.sum(W**2,axis=1)))
    istdev = istdev*1.1
print("istdev should be approximately {0}".format(istdev))
In [266]:
# MLP layer 2
maxnorm = 0
istdev = 0.001
while maxnorm < 0.3:
    W = np.random.randn(1024,1024)*istdev
    maxnorm = np.mean(np.sqrt(np.sum(W**2,axis=1)))
    istdev = istdev*1.1
print("istdev should be approximately {0}".format(istdev))
In [268]:
# output layer
maxnorm = 0
istdev = 0.0001
while maxnorm < 0.3:
    W = np.random.randn(121,1024)*istdev
    maxnorm = np.mean(np.sqrt(np.sum(W**2,axis=1)))
    istdev = istdev*1.1
print("istdev should be approximately {0}".format(istdev))
Hopefully those are approximately correct. Setting these parameters in the model and running for a small number of epochs.
In [269]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h1',
output_channels=128,
init_bias=0.0,
kernel_shape= [4,4],
irange=0.0074,
max_kernel_norm=2.0,
pool_shape=[6,6],
pool_stride=[4,4]
),
pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h2',
output_channels=128,
init_bias=0.0,
max_kernel_norm=3.5,
kernel_shape= [3,3],
irange=0.0098,
pool_shape=[2,2],
pool_stride=[2,2]
),
pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h3',
output_channels=128,
init_bias=0.0,
max_kernel_norm=3.5,
kernel_shape= [3,3],
irange=0.0098,
pool_shape=[2,2],
pool_stride=[2,2]
),
pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h4',
istdev=0.0158,
max_col_norm=3.5),
pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h5',
istdev=0.0108,
max_col_norm=3.5),
pylearn2.models.mlp.Softmax(
n_classes=121,
max_col_norm=4.0,
layer_name='y',
istdev=0.0107
)])
In [270]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.EpochCounter(max_epochs=30)
)
In [271]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/iterative_design_conv_recent.pkl',
save_freq=1
)
In [272]:
%%time
%%capture logs
train.main_loop()
In [273]:
model = pylearn2.utils.serial.load(
"/disk/scratch/neuroglycerin/models/iterative_design_conv_recent.pkl")
In [274]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[274]:
Looks like we've probably fixed the problem of getting stuck around 4 that we saw before. It's unclear whether we're going to run into new problems with norm saturation, but we'll continue blindly.
Before starting another long run to see if this model will overfit and what score it's likely to get, we can consider what we're doing with the learning rate. The learning rate appears to be quite high at the moment, judging by the step sizes we're getting every epoch. It could be a good idea to stop once the validation set stops improving (early stopping), so we're changing the termination criterion to check for improvement on the validation set and stop when it hasn't improved over the last 5 epochs.
In [277]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.MonitorBased(
prop_decrease=0.01, N=5, channel_name='valid_y_nll')
)
In [278]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h1',
output_channels=128,
init_bias=0.0,
kernel_shape= [4,4],
irange=0.0074,
max_kernel_norm=2.0,
pool_shape=[6,6],
pool_stride=[4,4]
),
pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h2',
output_channels=128,
init_bias=0.0,
max_kernel_norm=3.5,
kernel_shape= [3,3],
irange=0.0098,
pool_shape=[2,2],
pool_stride=[2,2]
),
pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h3',
output_channels=128,
init_bias=0.0,
max_kernel_norm=3.5,
kernel_shape= [3,3],
irange=0.0098,
pool_shape=[2,2],
pool_stride=[2,2]
),
pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h4',
istdev=0.0158,
max_col_norm=3.5),
pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h5',
istdev=0.0108,
max_col_norm=3.5),
pylearn2.models.mlp.Softmax(
n_classes=121,
max_col_norm=4.0,
layer_name='y',
istdev=0.0107
)])
In [279]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/iterative_design_conv_early_recent.pkl',
save_freq=1
)
In [280]:
%%time
%%capture logs
train.main_loop()
In [281]:
model = pylearn2.utils.serial.load(
'/disk/scratch/neuroglycerin/models/iterative_design_conv_early_recent.pkl')
In [282]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[282]:
Looks like I didn't set the number of epochs on that monitor-based termination criterion properly. We should graph the value it's watching to get a better idea of what to set it to.
In [289]:
c = np.array(model.monitor.channels['valid_y_nll'].val_record)
In [319]:
p = []
N = 5
for i,x in enumerate(c[:-N]):
    l = min(c[i:i+N])
    p.append((x-l)/x)
plt.plot(p)
Out[319]:
So it checks whether any of the last N epochs have improved (by the given proportion) on the score from N epochs earlier. If I'd set N to 10, it wouldn't have tripped the condition:
In [321]:
p = []
N = 10
for i,x in enumerate(c[:-N]):
    l = min(c[i:i+N])
    p.append((x-l)/x)
plt.plot(p)
Out[321]:
To be safe, going to set this to 30.
In [322]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.Default(),
termination_criterion=pylearn2.termination_criteria.MonitorBased(
prop_decrease=0.01, N=30, channel_name='valid_y_nll')
)
In [323]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h1',
output_channels=128,
init_bias=0.0,
kernel_shape= [4,4],
irange=0.0074,
max_kernel_norm=2.0,
pool_shape=[6,6],
pool_stride=[4,4]
),
pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h2',
output_channels=128,
init_bias=0.0,
max_kernel_norm=3.5,
kernel_shape= [3,3],
irange=0.0098,
pool_shape=[2,2],
pool_stride=[2,2]
),
pylearn2.models.mlp.ConvRectifiedLinear(
layer_name='h3',
output_channels=128,
init_bias=0.0,
max_kernel_norm=3.5,
kernel_shape= [3,3],
irange=0.0098,
pool_shape=[2,2],
pool_stride=[2,2]
),
pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h4',
istdev=0.0158,
max_col_norm=3.5),
pylearn2.models.mlp.RectifiedLinear(
dim=1024,
layer_name='h5',
istdev=0.0108,
max_col_norm=3.5),
pylearn2.models.mlp.Softmax(
n_classes=121,
max_col_norm=4.0,
layer_name='y',
istdev=0.0107
)])
In [324]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/iterative_design_conv_early_recent.pkl',
save_freq=1
)
In [325]:
%%time
%%capture logs
train.main_loop()
In [376]:
model = pylearn2.utils.serial.load(
'/disk/scratch/neuroglycerin/models/iterative_design_conv_early_recent.pkl')
In [327]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[327]:
It looks like it was beginning to overfit when the termination criterion triggered, so it performed as expected. The minimum score we're getting on validation is now:
In [331]:
float(min(model.monitor.channels['valid_y_nll'].val_record))
Out[331]:
That's quite impressive for a model that's only seen 10% of the training set. Seems like it might be worth seeing what this model can do on the full training set overnight (it's getting late). Doing that next.
Might also be worth looking at the weights this model is able to learn.
In [378]:
w = pl.model_weights(model)
In [381]:
w.cols(10)
Out[381]:
It's definitely learning some of the weights, but a large proportion of the kernels remain pretty much blank.
The only thing I'm going to change when writing the YAML is to add an LR adjuster, as Matt seems to have got it working quite well and the learning rate this model uses is likely much too high once it settles into a good local optimum. I've also left in the momentum schedule, but turned it down slightly.
In [356]:
%%writefile yaml_templates/interactive_iterative_version1.yaml
!obj:pylearn2.train.Train {
dataset: &train !obj:neukrill_net.image_directory_dataset.ListDataset {
transformer: !obj:neukrill_net.augment.RandomAugment {
units: 'float',
rotate: -1,
rotate_is_resizable: 0,
flip: 1,
shunt: 0.05,
scale: 0.05,
shear: 5,
resize: %(final_shape)s,
normalise: {global_or_pixel: 'global',
mu: %(mu)s,
sigma: %(sigma)s}
},
settings_path: %(settings_path)s,
run_settings_path: %(run_settings_path)s
},
model: !obj:pylearn2.models.mlp.MLP {
batch_size: &batch_size 128,
input_space: !obj:pylearn2.space.Conv2DSpace {
shape: %(final_shape)s,
num_channels: 1,
axes: ['b', 0, 1, 'c'],
},
layers: [!obj:pylearn2.models.mlp.ConvRectifiedLinear {
layer_name: 'h1',
output_channels: 128,
init_bias: 0.0,
kernel_shape: [4,4],
irange: 0.0074,
max_kernel_norm: 2.0,
pool_shape: [6,6],
pool_stride: [4,4]
},
!obj:pylearn2.models.mlp.ConvRectifiedLinear {
layer_name: 'h2',
output_channels: 128,
init_bias: 0.0,
max_kernel_norm: 3.5,
kernel_shape: [3,3],
irange: 0.0098,
pool_shape: [2,2],
pool_stride: [2,2]
},
!obj:pylearn2.models.mlp.ConvRectifiedLinear {
layer_name: 'h3',
output_channels: 128,
init_bias: 0.0,
max_kernel_norm: 3.5,
kernel_shape: [3,3],
irange: 0.0098,
pool_shape: [2,2],
pool_stride: [2,2]
},
!obj:pylearn2.models.mlp.RectifiedLinear {
dim: 1024,
layer_name: 'h4',
istdev: 0.0158,
max_col_norm: 3.5},
!obj:pylearn2.models.mlp.RectifiedLinear {
dim: 1024,
layer_name: 'h5',
istdev: 0.0108,
max_col_norm: 3.5},
!obj:pylearn2.models.mlp.Softmax {
n_classes: 121,
max_col_norm: 4.0,
layer_name: 'y',
istdev: 0.0107
}]
},
algorithm: !obj:pylearn2.training_algorithms.sgd.SGD {
train_iteration_mode: even_shuffled_sequential,
monitor_iteration_mode: even_sequential,
batch_size: *batch_size,
learning_rate: 0.1,
learning_rule: !obj:neukrill_net.update_norm_monitor.UpdateNormMonitorLearningRule {
base_learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum {
init_momentum: 0.5
},
decay: 0.9,
},
monitoring_dataset: {
'train': *train,
'valid' : !obj:neukrill_net.image_directory_dataset.ListDataset {
transformer: !obj:neukrill_net.augment.RandomAugment {
resize: %(final_shape)s,
units: 'float',
normalise: {global_or_pixel: 'global',
mu: %(mu)s,
sigma: %(sigma)s}
},
settings_path: %(settings_path)s,
run_settings_path: %(run_settings_path)s,
training_set_mode: "validation"
}
},
cost: !obj:pylearn2.costs.mlp.Default {},
termination_criterion: !obj:pylearn2.termination_criteria.MonitorBased {
prop_decrease: 0.01,
N: 30,
channel_name: 'valid_y_nll'},
},
extensions: [
!obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor {
start: 1,
saturate: 200,
final_momentum: 0.5
},
!obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest {
channel_name: valid_y_nll,
save_path: '%(save_path)s'
},
!obj:pylearn2.training_algorithms.sgd.MonitorBasedLRAdjuster {
high_trigger: 1.,
low_trigger: 0.999,
grow_amt: 1.01,
shrink_amt: 0.99,
max_lr: 1.,
min_lr: 1e-5,
channel_name: valid_y_nll
}
],
save_path: '%(alt_picklepath)s',
save_freq: 1
}
In [357]:
%%writefile run_settings/interactive_iterative_version1.json
{
"final_shape":[
48,
48
],
"model type":"pylearn2",
"preprocessing":{
"normalise":{
"global_or_pixel":"global",
"mu":0.9572712779045105,
"sigma":0.14225326478481293
},
"resize":[
48,
48
]
},
"train_split":0.8,
"yaml file":"interactive_iterative_version1.yaml"
}
Then I run the following command outside the notebook (because background processes apparently aren't supported easily, and I don't want to use IPython.parallel):
!python train.py run_settings/interactive_iterative_version1.json > /disk/scratch/neuroglycerin/logs/interactive_iterative_version1.log
Anyway, now that it's running we can tail the logs.
In [408]:
!tail -n 20 /disk/scratch/neuroglycerin/logs/interactive_iterative_version1.log
And look at the plots from the recent pickle file to check for problems:
In [402]:
model = pylearn2.utils.serial.load(
'/disk/scratch/neuroglycerin/models/interactive_iterative_version1_recent.pkl')
In [403]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[403]:
In [404]:
curves = pl.monitor_channels(model, model.monitor.channels, x_axis='epoch')
In [405]:
curves
Out[405]:
Looking at the weights learned:
In [409]:
w = pl.model_weights(model)
In [410]:
w.cols(10)
Out[410]:
So some of the filters are becoming more defined, but there are still a large proportion of blank ones.
Without dropout, it looks like we're going to overfit on the training set even with this much augmentation, so we should use dropout; but we should also make sure our model is deep enough. It could also be that the max pooling being applied is too aggressive, so we're not able to propagate gradients back to all of the filters in the first layer. It could be better to use fewer filters in the early layers, but make them larger.
Also, looking at the top-scoring results for CIFAR-100, it could be worthwhile using Maxout layers instead of plain rectified linear layers. Given all of this, it seems worth giving the Maxout network from the original paper another shot.
In [445]:
main_mlp = pylearn2.models.mlp.MLP(
input_space=input_space,
batch_size=128,
layers=[pylearn2.models.maxout.MaxoutConvC01B(
layer_name='h1',
pad=4,
tied_b=1,
W_lr_scale=0.05,
b_lr_scale=0.05,
num_channels=96,
num_pieces=2,
kernel_shape=[8,8],
pool_shape=[5,5],
pool_stride=[4,4],
init_bias=0.0,
irange=0.005,
max_kernel_norm=2.0,
partial_sum=49
),
pylearn2.models.maxout.MaxoutConvC01B(
layer_name='h2',
pad=3,
tied_b=1,
W_lr_scale=0.05,
b_lr_scale=0.05,
num_channels=192,
num_pieces=2,
kernel_shape= [8,8],
pool_shape=[4,4],
pool_stride=[2,2],
init_bias=0.0,
irange=0.0098,
max_kernel_norm=3.5,
partial_sum=11
),
pylearn2.models.maxout.MaxoutConvC01B(
pad=3,
layer_name='h3',
W_lr_scale=0.05,
b_lr_scale=0.05,
num_channels=192,
num_pieces=2,
kernel_shape= [5,5],
pool_shape=[2,2],
pool_stride=[2,2],
init_bias=0.0,
irange=0.0098,
max_kernel_norm=3.5
),
pylearn2.models.maxout.Maxout(
layer_name='h4',
irange=0.005,
num_units=500,
num_pieces=5,
max_col_norm=3.5),
pylearn2.models.mlp.Softmax(
n_classes=121,
max_col_norm=4.0,
layer_name='y',
irange=0.005
)])
This results in quite a nice model shape; there don't appear to be too many parameters:
In [446]:
print main_mlp
Testing this on the smaller training set as before, but adding dropout as in the Maxout paper.
In [447]:
algorithm = pylearn2.training_algorithms.sgd.SGD(
train_iteration_mode='even_shuffled_sequential',
monitor_iteration_mode='even_sequential',
batch_size=128,
learning_rate=0.1,
learning_rule= pylearn2.training_algorithms.learning_rule.Momentum(
init_momentum=0.5
),
monitoring_dataset={
'train':dataset,
'valid':neukrill_net.image_directory_dataset.ListDataset(
transformer=neukrill_net.augment.RandomAugment(
units='float',
resize=final_shape,
normalise={'global_or_pixel':'global',
'mu': 0.957,
'sigma': 0.142}
),
settings_path=os.path.abspath("settings.json"),
run_settings_path=os.path.abspath("run_settings/replicate_8aug.json"),
force=True, training_set_mode='validation'
)
},
cost=pylearn2.costs.mlp.dropout.Dropout(
input_include_probs={"h1":0.8},
input_scales={"h1":1.}
),
termination_criterion=pylearn2.termination_criteria.MonitorBased(
prop_decrease=0.001, N=100, channel_name='valid_y_nll')
)
In [448]:
train = pylearn2.train.Train(
dataset=dataset,
model=main_mlp,
algorithm=algorithm,
save_path='/disk/scratch/neuroglycerin/models/iterative_design_maxout_recent.pkl',
save_freq=1
)
In [449]:
%%time
%%capture logs
train.main_loop()
In [450]:
model = pylearn2.utils.serial.load(
'/disk/scratch/neuroglycerin/models/iterative_design_maxout_recent.pkl')
In [451]:
pl.monitor_channels(model, [c for c in model.monitor.channels if "nll" in c], x_axis="epoch")
Out[451]:
We can clearly see that it's much noisier with maxout. The learning rate is probably much too high, and it likely requires some kind of decay schedule, plus quite a few other modifications before it's practical.